## packages: remove or add your necessary packages

required_packages <- c("tidyverse", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools")

library(ggplot2)    # CRAN v3.3.6
library(colorspace) # CRAN v2.0-3
library(here)       # CRAN v1.0.1
library(dplyr)      # CRAN v1.0.10
library(janitor)    # CRAN v2.1.0
library(gt)         # CRAN v0.5.0
library(tidyr)      # CRAN v1.2.1
library(readr)      # CRAN v2.1.3
library(stringr)    # CRAN v1.4.1
library(tidytext)
library(ggalt)
library(forcats)
library(lubridate)

# for(i in required_packages) { 
# if(!require(i, character.only = T)) {
# 
# #  if package is not existing, install then load the package
# install.packages(i, dependencies = T)
# require(i, character.only = T)
# }
# }


## save plots?
save <- TRUE
#save <- FALSE

## quality of png's
dpi <- 750

## font adjust; please adjust to client´s website
#extrafont::loadfonts(device = "win", quiet = TRUE)
#font_add_google("Montserrat", "Montserrat")
# font_add_google("Overpass", "Overpass")
# font_add_google("Overpass Mono", "Overpass Mono")



## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))


theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 20,
                                       face = "bold",
                                       lineheight = 1.05,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))

## theme settings for flipped plots
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))

## theme settings for maps
theme_map <- 
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17, 
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))

## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)

## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")

## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")

## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))

## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)

Load data

youtube_data <- read_csv(here("proc_data","youtube_data_proc.csv"))
youtube_data_activities <- read_csv(here("proc_data","youtube_data_activities_proc.csv"))
tiktok_data <-  read_csv(here("proc_data","tiktok_data_proc.csv"))
tiktok_data_activities <-  read_csv(here("proc_data","tiktok_data_activities_proc.csv"))

Calculate general stats

yt_vids <- youtube_data %>% distinct(yt_video_id) %>% nrow()
tt_vids <- tiktok_data %>% distinct(tt_video_id) %>% nrow()
youtube_data_activities %>% group_by(yt_video_id) %>% summarise(idn=max(idea)) %>% 
   pull(idn) %>% {length(which(.>1))}-> mult_ideas_yt
tiktok_data_activities %>% group_by(tt_video_id) %>% summarise(idn=max(idea)) %>% 
   pull(idn) %>% {length(which(.>1))} -> mult_ideas_tt

meanytlength <- youtube_data$video_length %>% summary %>% {./60}
meanttlength <- tiktok_data$video_meta_duration %>% summary 

General stats

YouTube: 177 videos (unique video url identifiers, includes YT shorts)
TikTok: 177 videos

YouTube videos are longer (12.15141 minutes on average for the sampled videos), so approximately one third of the videos examined (53/177) included >1 money-making idea. TikTok videos have a shorter maximum length (3 to 10 minutes; 41.14384 seconds on average for the sampled videos) so videos on this platform tend to feature a single idea. Only 4 of the 145 TikTok videos examined provided more than one money-making idea.

Publication dates

youtube_data <- youtube_data %>% mutate(month=month(ymd(youtube_data$publish_date)),
                        pyear=year(ymd(youtube_data$publish_date))) %>% 
  mutate(pub_date=ymd(publish_date))


tiktok_data <- tiktok_data %>% mutate(month=month(ymd_hms(tiktok_data$create_time_iso)),                                pyear=year(ymd_hms(tiktok_data$create_time_iso))) %>%
  mutate(pub_date=date(ymd_hms(create_time_iso))) 

3/4 of the YouTube videos examined were published in 2022, and across all the videos sampled (published since 2018), most are from the summer/fall season (Northern Hemisphere).

TikTok videos in the sample were published between 2019-2022, with more videos uploaded with each passing year. The month with most uploads is July.

tiktok_data %>% tabyl(pyear) %>% round(2)
##  pyear  n percent
##   2019  4    0.03
##   2020 29    0.20
##   2021 50    0.34
##   2022 63    0.43

Publication month also varied between platforms.

youtube_data %>% count(month) %>% 
  ggplot()+
  geom_bar(aes(x=month,y=n),stat = "identity")+
  scale_x_discrete(limits=month.abb) +labs(subtitle = "YouTube data")

tiktok_data %>% count(month) %>% 
  ggplot()+
  geom_bar(aes(x=month,y=n),stat = "identity")+
  scale_x_discrete(limits=month.abb) +labs(subtitle = "TikTok data")

Considering publication dates, videos published earlier do not tend to accumulate more views and comments over time. Engagement is also mostly unrelated to subscriber/follower counts and thus possibly related to content.

ttdatevc <- tiktok_data %>% select(source,pub_date,
                       comments=comment_count,
                       views=play_count,
                       followers=author_meta_fans)
ytdatevc <- youtube_data %>% select(source,pub_date,
                       comments=comments,
                       views=view_count,
                       followers=subs_numeric)
dates_views_comments <- bind_rows(ttdatevc,ytdatevc)

ggplot(dates_views_comments)+
  geom_point(aes(x=pub_date,y=views,color=source))+
  labs(x="Publication date")

ggplot(dates_views_comments)+
  geom_point(aes(x=pub_date,y=comments,color=source))+
    labs(x="Publication date")

ggplot(dates_views_comments)+
  geom_point(aes(x=views,y=comments,color=source))

ggplot(dates_views_comments)+
  geom_point(aes(x=followers,y=comments,color=source))

dates_views_comments %>% filter(followers!=44100000) %>% 
ggplot()+
  geom_point(aes(x=followers,y=comments,color=source))+
  labs(subtitle = "removed outlier")

ggplot(dates_views_comments)+
  geom_point(aes(x=followers,y=views,color=source))

dates_views_comments %>% filter(followers!=44100000) %>% 
ggplot()+
  geom_point(aes(x=followers,y=views,color=source))+
  labs(subtitle = "removed outlier")

Presenter demographics

yt_presenter_demog_gend <- youtube_data %>% tabyl(presenter_gender) %>% 
  mutate(valid_percent=round(valid_percent,2))
yt_malepct <- yt_presenter_demog_gend$valid_percent[2] 
tt_presenter_demog_gend <- tiktok_data %>% tabyl(presenter_gender) %>% 
  mutate(valid_percent=round(valid_percent,2))
tt_malepct <- tt_presenter_demog_gend$valid_percent[2] 
yt_ages <- youtube_data %>% tabyl(presenter_age) %>% na.omit() %>% select(-percent) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent)
tt_ages <- tiktok_data %>% tabyl(presenter_age) %>% na.omit() %>% select(-percent) %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent)

Male presenters were represented more on both platforms (YouTube: 0.86% and TikTok 0.8%), and the 20-30 y.o. age category had the highest proportion with ~40%.

gt(yt_ages) %>% tab_header("YouTube")
YouTube
presenter_age n percent
10 - 20 6 0.03
20 - 30 76 0.43
30 - 40 49 0.28
40 - 50 4 0.02
50+ 3 0.02
Voice-over 20 0.11
Voice-over Text-to-Speech 18 0.10
gt(tt_ages) %>% tab_header("TikTok")
TikTok
presenter_age n percent
10 - 20 14 0.10
20 - 30 57 0.40
30 - 40 24 0.17
40 - 50 5 0.04
50+ 2 0.01
Music 8 0.06
Voice-over 6 0.04
Voice-over Text-to-Speech 25 0.18

Categories

YouTube videos, as categorized by their authors, varied in assignment despite the similar overarching topic.

The most common category was Education, followed by How-to % Stlye, and then all the others.

youtube_data %>% tabyl(category) %>% arrange(-n) %>% 
  mutate(across(where(is.numeric),round,2)) %>% gt() %>% tab_header(title = "YouTube data")
YouTube data
category n percent
Education 96 0.54
Howto & Style 46 0.26
People & Blogs 27 0.15
Entertainment 7 0.04
News & Politics 1 0.01

Earnings data

ytearn <- 
youtube_data_activities %>% 
  group_by(yt_video_id,idea,earnings_timeframe) %>% 
  summarise(earn=mean(earnings,na.rm=TRUE)) %>% ungroup() %>% na.omit() %>% 
  filter(earnings_timeframe!="No timeframe provided")
ttearn <- 
  tiktok_data_activities %>% 
  group_by(tt_video_id,idea,earnings_timeframe) %>% 
  summarise(earn=mean(earnings,na.rm=TRUE)) %>% ungroup() %>% na.omit() %>% 
  filter(earnings_timeframe!="No timeframe provided")

# earnings time frames
earn_tf <- bind_rows(ytearn,ttearn) %>% count(earnings_timeframe) %>% arrange(-n)

earnings_by_tf <- 
bind_rows(ytearn,ttearn) %>% group_by(earnings_timeframe) %>% 
  summarize(median_earn=median(earn),
            min_earn=min(earn),max_earn=max(earn),
            sd_earn=sd(earn,na.rm = TRUE)) %>% arrange(-median_earn)
  • The most common time frame for earnings was for daily income, followed by months and hours.
gt(earn_tf)
earnings_timeframe n
Days 69
Months 66
Hours 56
One-time earnings 31
Minutes 21
Weeks 9
Years 5
Per Post 2
  • Longer time frames report higher median earnings.
gt(earnings_by_tf)
earnings_timeframe median_earn min_earn max_earn sd_earn
Years 100000.000 500.000 400000.00 155449.21533
Months 5375.000 15.000 300000.00 50663.54170
Weeks 1050.000 24.000 14000.00 4357.91659
Days 500.000 5.000 7000.00 1324.43451
Hours 40.000 3.000 487.85 109.77971
One-time earnings 30.000 1.000 1225.00 271.99917
Minutes 26.000 0.042 400.00 152.51315
Per Post 16.895 0.500 33.29 23.18603

Standardized earnings

temporal_earn <- c("Days","Hours","Minutes","Months","Weeks","Years")

yt_tempearn <- ytearn %>% filter(earnings_timeframe %in% temporal_earn)
tt_tempearn <- ttearn %>% filter(earnings_timeframe %in% temporal_earn)

yt_hourly_earn <- 
yt_tempearn %>% mutate(hourly_earn=case_when(
  earnings_timeframe=="Hours"~earn,
  earnings_timeframe=="Minutes"~earn/60,
  earnings_timeframe=="Days"~earn/8,
  earnings_timeframe=="Weeks"~earn/40,
  earnings_timeframe=="Months"~earn/200,
  earnings_timeframe=="Years"~earn/2400
)) %>% mutate(source="YouTube")

tt_hourly_earn <- 
  tt_tempearn %>% mutate(hourly_earn=case_when(
    earnings_timeframe=="Hours"~earn,
    earnings_timeframe=="Minutes"~earn/60,
    earnings_timeframe=="Days"~earn/8,
    earnings_timeframe=="Weeks"~earn/40,
    earnings_timeframe=="Months"~earn/200,
    earnings_timeframe=="Years"~earn/2400
  )) %>% mutate(source="TikTok")

all_earn <- bind_rows(yt_hourly_earn,tt_hourly_earn) 
hourly_med <- median(all_earn$hourly_earn)


bind_rows(yt_hourly_earn,tt_hourly_earn) %>% 
  ggplot()+
  geom_histogram(aes(hourly_earn,fill=source),color="black",alpha=0.5)

For videos that report earnings associated with a temporal reference ($ earned per unit of time), earnings can be reported in a common unit by assuming 8 hour work days and 5 day work weeks. The median hourly earnings is 37.5.

Across all videos, earnings are right-skewed. 90% of videos report hourly earnings > 275.

This distribution is also evident within earnings timeframes.

bind_rows(yt_hourly_earn,tt_hourly_earn) %>% 
  ggplot()+
  geom_histogram(aes(earn))+
  facet_wrap(~earnings_timeframe,scales = 'free')

Earnings by category (YouTube)

The more common categories (Education, Howto & Style) did not report the higher mean or median standardized earnings. Instead, the People and Blogs category and Entertainment had the top two positions.

yt_hourlycorrs <- left_join(yt_hourly_earn,youtube_data_activities)

yt_hourlycorrs_chp <- yt_hourlycorrs  %>% group_by(yt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()
yt_hourlycorrs_chp %>% 
  group_by(category) %>% summarise(mean_earn=mean(hourly_earn),
                                   med_earn=median(hourly_earn)) %>% 
  arrange(-mean_earn) %>% gt()
category mean_earn med_earn
People & Blogs 206.80750 75.00
Entertainment 127.05000 125.00
Howto & Style 90.84293 43.75
Education 68.85482 37.50
yt_hourlycorrs_chp %>% 
  ggplot(aes(x=category,y=hourly_earn,color=category))+
  geom_jitter() + scale_color_discrete(guide="none")

Business types and activities

YouTube

For all YouTube videos, the predominant Business Type for the money-making ideas was Publication, Media, and Blogs, followed by the Service Business. Other business types were less common.

# without earnings
yt_acts_chp <- youtube_data_activities  %>% group_by(yt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()

yt_bus1 <- youtube_data_activities  %>% group_by(yt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

bus1ct <- yt_bus1 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_1) %>% 
  tabyl(business_type_level_1) %>% arrange(-n)

bus1ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% 
  gt() %>% tab_header("YouTube",subtitle = "Business Types, all videos") 
YouTube
Business Types, all videos
business_type_level_1 n percent
Publication, Media & Blog 175 0.45
Service Business 122 0.31
Ecommerce & Consumer 56 0.14
Investing 27 0.07
Software & Tech 9 0.02


For videos and ideas with reported earnings, the business activity with the highest earnings (standardized) was Publication, Media, & Blog, followed by investing.

# with earninings

yt_hourlycorrs_bus1 <- yt_hourlycorrs  %>% group_by(yt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

yt_hourlycorrs_bus1 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup() %>% 
  group_by(business_type_level_1) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-median_earn) %>% gt() %>% tab_header("YouTube",
                                                subtitle = "standardized hourly earning by business types")
YouTube
standardized hourly earning by business types
business_type_level_1 mean_earn median_earn
Publication, Media & Blog 123.72364 62.50000
Investing 31.20833 26.25000
Software & Tech 20.00000 20.00000
Service Business 35.67339 17.50000
Ecommerce & Consumer 48.28748 16.77083


However, there is considerable variation in earnings across the different business types

yt_hourlycorrs_bus1 %>% group_by(yt_video_id,idea) %>% 
  distinct(yt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup %>% 
  ggplot()+
  geom_jitter(aes(x=str_wrap(business_type_level_1,12),
                  y=hourly_earn,color=business_type_level_1))+
  scale_color_discrete(guide="none")+labs(x="Business Type (level 1)")

TikTok

For TikTok videos, the predominant Business Type for the money-making ideas was Service Business with almost 50% of videos, followed by the Ecommerce & Consumer ventures. Other business types were less common.

# tt without earnings
tt_acts_chp <- tiktok_data_activities  %>% group_by(tt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()

tt_bus1 <- tiktok_data_activities  %>% group_by(tt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

tt_bus1ct <- tt_bus1 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_1)  %>% 
  tabyl(business_type_level_1) %>% arrange(-n)


tt_bus1ct %>% na.omit() %>% mutate(percent=round(valid_percent,2)) %>% select(-valid_percent) %>% 
  gt() %>% tab_header("TikTok",subtitle = "Business Types, all videos") 
TikTok
Business Types, all videos
business_type_level_1 n percent
Service Business 74 0.49
Ecommerce & Consumer 36 0.24
Publication, Media & Blog 28 0.19
Investing 12 0.08
Software & Tech 1 0.01

For videos and ideas with reported earnings, the business activity with the highest earnings (standardized) was Investing, followed by Ecommerce & Consumer

# tt with earninings
tt_hourlycorrs <- left_join(tt_hourly_earn,tiktok_data_activities)

tt_hourlycorrs_chp <- tt_hourlycorrs  %>% group_by(tt_video_id,idea) %>%
  chop(business_type_level_1) %>% chop(business_type_level_2) %>% chop(skills_required) %>% ungroup()

tt_hourlycorrs_bus1 <- tt_hourlycorrs  %>% group_by(tt_video_id,idea) %>%
  unchop(business_type_level_1) %>%  ungroup()

tt_hourlycorrs_bus1 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup() %>% 
  group_by(business_type_level_1) %>% 
  summarise(mean_earn=mean(hourly_earn),
            median_earn=median(hourly_earn)) %>% 
  arrange(-median_earn) %>% gt()
business_type_level_1 mean_earn median_earn
Investing 230.37097 135.0000
Ecommerce & Consumer 215.47273 109.9006
Publication, Media & Blog 221.63810 31.2500
Service Business 58.37537 25.0000
Software & Tech 1.48500 1.4850

However, with some exceptions, earnings do not vary considerably across the different business types

tt_hourlycorrs_bus1 %>% group_by(tt_video_id,idea) %>% 
  distinct(tt_video_id,idea,business_type_level_1,hourly_earn) %>% ungroup %>% 
  ggplot()+
  geom_jitter(aes(x=str_wrap(business_type_level_1,12),
                  y=hourly_earn,color=business_type_level_1))+
  scale_color_discrete(guide="none")+labs(x="Business Type (level 1)",subtitle = "TikTok")

# 1 saving plots in pdf with example

# ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color= drv)) + 
#      geom_smooth(mapping = aes(linetype = drv), method = 'loess') +
#      geom_point()
# 
# if(save == T){ 
#   ggsave(here::here("plots", "name_plot.pdf"), 
#          width = 12.5, height = 8, device = cairo_pdf)
# }

# 2 pdfs will then be converted into the pngs using the 04_convert_pdfs_to_pngs.rmd file. 

View counts, comments, followers, and standardized earnings are not tightly associated.

yt_hourly_renamed <-  yt_hourlycorrs_chp %>%  
  select(earn=hourly_earn,views=view_count,source,comments,followers=subs_numeric)
  
tt_hourly_renamed <-  tt_hourlycorrs_chp %>%  
  select(earn=hourly_earn,views=play_count,source,comments=comment_count,
           followers=author_meta_fans)

hourlyboth <- bind_rows(yt_hourly_renamed,tt_hourly_renamed)

ggplot(hourlyboth)+aes(x=views,y=earn,color=source)+geom_point()

ggplot(hourlyboth)+aes(x=comments,y=earn,color=source)+geom_point()

ggplot(hourlyboth)+aes(x=followers,y=earn,color=source)+geom_point()

Video titles

In general, the video titles vary considerably across platforms in terms of length, content and style.

tiktok_data <-  tiktok_data %>% mutate(title_noHash=str_extract(text,"^[^#]*")) 
yt_tlength <- round(mean(str_length(youtube_data$title)),0)
tt_tlength <-round(mean(str_length(tiktok_data$text)))
tt_tlength_nh <-round(mean(str_length(tiktok_data$title_noHash)))

Without various trailing hashtags, YouTube video titles are on average, twice as long as TikTok titles (65 vs. 31 characters). Overall, roughly a third of the length of TikTok titles comprises various hashtags.

The words and bigrams (consecutive sequences of two words) that appear most frequently in the video’s titles vary significantly between platforms.

# tokenize 
stopwords <- c("for","in","a","the","to","with","from","by")
title_words_yt <- youtube_data %>% unnest_tokens(title_wrd,title,token = "words") %>% 
  filter(!title_wrd %in% stopwords)
title_bigrams_yt <- youtube_data %>% unnest_tokens(title_bg,title,token = "ngrams",n=2) 
title_words_tt <- tiktok_data %>% unnest_tokens(title_wrd,text,token = "tweets") %>% 
  filter(!title_wrd %in% stopwords)
title_bigrams_tt <- title_words_tt %>% mutate(nextwrdbg=lead(title_wrd)) %>% 
  unite(title_bg, title_wrd, nextwrdbg, sep = ' ')


wordsyt <- title_words_yt %>% count(title_wrd) %>% slice_max(n,n=15) %>% mutate(source="YouTube")
wordstt <- title_words_tt %>% count(title_wrd) %>% slice_max(n,n=15) %>% mutate(source="TikTok")
bg_yt <- title_bigrams_yt %>% count(title_bg) %>% slice_max(n,n=15) %>% mutate(source="YouTube")
bg_tt <- title_bigrams_tt %>% count(title_bg) %>% slice_max(n,n=15) %>% mutate(source="TikTok")
top15wrds <- bind_rows(wordsyt,wordstt)
top15bg <- bind_rows(bg_yt,bg_tt)

ggplot(top15wrds)+
  geom_lollipop(aes(x=fct_reorder(title_wrd,n),y=n))+
  facet_wrap(~source)+labs(x="word or hashtag",y='occurrences')+
  coord_flip()

ggplot(top15bg)+
  geom_lollipop(aes(x=fct_reorder(title_bg,n),y=n))+
  facet_wrap(~source)+labs(x="bigram",y='occurrences')+
  coord_flip()

Considering the top 15 words or bigrams, there is little overlap between platforms.